%%capture
!pip install pyvis
!pip install bokeh
!pip install node2vec
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV,RepeatedStratifiedKFold,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from node2vec import Node2Vec
from itertools import combinations
from pyvis.network import Network
from IPython.core.display import display, HTML
import warnings
# Silence library warnings (pandas/sklearn/networkx are chatty) for cleaner notebook output.
warnings.filterwarnings("ignore")
# Load the raw chess results: one row per game with white/black player ids,
# month, and Score (1 = white wins, 0 = black wins, 0.5 = draw — see encodings below).
data = pd.read_csv('/content/drive/MyDrive/DA 516 - Network Project/data.csv')
print(data.shape)
data.head()
## Build a player-level train/test split.
# Count games per player on each colour, then total them per player.
white=pd.DataFrame(data.groupby('White Player #')['Score'].count()).reset_index().rename(columns = {'White Player #':'PlayerId','Score':'MatchCount'})
black=pd.DataFrame(data.groupby('Black Player #')['Score'].count()).reset_index().rename(columns = {'Black Player #':'PlayerId','Score':'MatchCount'})
all_count = pd.concat([white,black])
all_count = pd.DataFrame(all_count.groupby('PlayerId')['MatchCount'].sum()).reset_index()
# Keep only players with more than one game.
all_count = all_count[all_count.MatchCount > 1]
# BUG FIX: sample() without random_state made the split (and every CSV saved
# downstream) non-reproducible across runs; pin the seed.
random_sample_players = all_count['PlayerId'].sample(2000, random_state=42)
# Test = games where BOTH players fall in the sampled set.
test = data[(data['White Player #'].isin(random_sample_players)) & (data['Black Player #'].isin(random_sample_players))].reset_index(drop = True)
print('Test Shape :',test.shape)
print('')
print('Test distribution %:')
print(test.Score.value_counts() / len(test))
print('')
# NOTE(review): this filter tests each column's membership independently, not
# row identity, so it removes every train row whose four values each appear
# somewhere in test (over-exclusion) rather than only the exact test rows.
# Kept as-is for compatibility; an indicator anti-join would be exact — confirm intent.
train = data[~((data['White Player #'].isin(test['White Player #'])) & (data['Black Player #'].isin(test['Black Player #'])) &
(data['Month #'].isin(test['Month #'])) & (data['Score'].isin(test['Score'])))].reset_index(drop = True)
print('Train shape :',train.shape)
print('')
print('Train distribution % :')
print(train.Score.value_counts() / len(train))
print('')
train.head()
### Checking if there is any player only in test data
# Collect the distinct player ids appearing on either colour in each split.
train_white_players = train['White Player #'].unique().tolist()
train_black_players = train['Black Player #'].unique().tolist()
train_players = pd.DataFrame({'PlayerID': train_white_players + train_black_players})
test_white_players = test['White Player #'].unique().tolist()
test_black_players = test['Black Player #'].unique().tolist()
test_players = pd.DataFrame({'PlayerID': test_white_players + test_black_players})
# Players that occur in test but never in train.
only_test_players = test_players[~(test_players.PlayerID.isin(train_players.PlayerID))]
print('Player # have randomly selected as test which does not exist in train:',len(only_test_players))
### Deleting those players from test dataset, in order to make sure we have test players in train as well for being able to calculate features
print('Test shape before eleminating unique test players : ' , test.shape)
keep_white = ~test['White Player #'].isin(only_test_players.PlayerID)
keep_black = ~test['Black Player #'].isin(only_test_players.PlayerID)
test = test[keep_white & keep_black]
print('Test shape after eleminating unique test players : ' ,test.shape)
# Persist the splits so later sessions can restart from here.
train.to_csv('/content/drive/MyDrive/DA 516 - Network Project/train.csv',index = False)
test.to_csv('/content/drive/MyDrive/DA 516 - Network Project/test.csv',index = False)
# Immediately re-read (checkpoint pattern typical of Colab notebooks).
train = pd.read_csv('/content/drive/MyDrive/DA 516 - Network Project/train.csv')
test = pd.read_csv('/content/drive/MyDrive/DA 516 - Network Project/test.csv')
print(train.shape)
print(test.shape)
train.head()
## Per-player result counts and average score, computed from train only.
# Work on explicit copies so the assignments below do not hit
# pandas' SettingWithCopy behaviour on slices of `train`.
train_source = train[['White Player #','Score']].copy()
# Encode the result from the white player's perspective (Score: 1 win, 0 loss, 0.5 draw).
train_source['Score'] = np.where(train_source.Score == 1, 'white_win', np.where(train_source.Score == 0, 'white_lost', 'white_draw'))
train_source.rename(columns = {'White Player #':'Player'},inplace = True)
train_target = train[['Black Player #','Score']].copy()
# Same game, re-encoded from the black player's perspective.
train_target['Score'] = np.where(train_target.Score == 0, 'black_win', np.where(train_target.Score == 1, 'black_lost', 'black_draw'))
train_target.rename(columns = {'Black Player #':'Player'},inplace = True)
# BUG FIX: pd.concat's axis argument is keyword-only in pandas >= 2.0;
# `pd.concat([...], 0)` raises TypeError there.
train_df = pd.concat([train_source, train_target], axis=0)
train_df.head()
# Count each outcome type per player, then pivot to one row per player.
player_stats = pd.DataFrame(train_df.groupby(['Player','Score'])['Score'].describe()).reset_index()
player_stats = pd.DataFrame(player_stats.pivot(index = 'Player',columns = 'Score', values = 'count').fillna(0)).reset_index()
# Chess scoring: draws are worth half a point.
player_stats['total_score'] = (player_stats.black_draw+player_stats.white_draw)*0.5 +(player_stats.black_win+player_stats.white_win)
player_stats['total_game_count'] = player_stats[['black_draw','black_lost','black_win','white_draw','white_lost','white_win']].sum(axis=1)
player_stats['Avg_Score'] = player_stats.total_score / player_stats.total_game_count
player_stats.head()
# Distribution of per-player average score.
fig, ax = plt.subplots(figsize=(12, 6))
ax.boxplot(player_stats.Avg_Score)
plt.show()
# Distribution of per-player game counts (long-tailed).
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(player_stats.total_game_count, bins=100)
plt.show()
# Players with a large game count (NOTE: variable name says "hundred" but the
# threshold used here is 150 — name kept for compatibility with later cells).
more_than_hundred_game = player_stats[player_stats.total_game_count > 150]['Player'].unique()
# Restrict to games where BOTH sides are in that set.
sub_ = train[train['White Player #'].isin(more_than_hundred_game)]
# BUG FIX: take an explicit copy before mutating — dropping a column in place
# on a chained slice triggers SettingWithCopyWarning and may silently not stick.
sub_ = sub_[sub_['Black Player #'].isin(more_than_hundred_game)].copy()
sub_.drop(columns ='Month #',inplace = True)
sub_.head()
# White earns points in games it did not lose; black in games it did not lose.
# Copies again so the Score rewrite below is safe.
train_white = sub_[sub_.Score !=0].copy()
train_black = sub_[sub_.Score !=1].copy()
# Convert black wins (Score 0) to a full point credited to black.
train_black['Score'] = np.where(train_black.Score == 0,1,train_black.Score)
train_white = pd.DataFrame(train_white.groupby(['White Player #','Black Player #'])['Score'].sum()).reset_index()
train_black = pd.DataFrame(train_black.groupby(['White Player #','Black Player #'])['Score'].sum()).reset_index()
# Orient edges from the point-earning player: white keeps orientation,
# black's edges are reversed.
train_white.rename(columns = {'White Player #':'Source','Black Player #':'Target'},inplace = True)
train_black.rename(columns = {'White Player #':'Target','Black Player #':'Source'},inplace = True)
train_weighted = pd.concat([train_white,train_black])
train_weighted.rename(columns = {'Score': 'Weight'},inplace = True)
# Sum points per directed pair -> edge weight.
train_weighted = train_weighted.groupby(['Source','Target'])['Weight'].sum().reset_index()
train_weighted.head()
# Directed, weighted head-to-head graph of the high-volume players.
data_weighted = nx.from_pandas_edgelist(train_weighted, source='Source', target='Target', edge_attr=True, create_using = nx.DiGraph())
plt.rcParams['figure.figsize'] = [30, 30]
# Spring layout; fixed seed keeps the picture reproducible, edge weights act as spring strengths.
pos = nx.spring_layout(data_weighted, scale = 2, seed = 42,weight='Weight')
nx.draw_networkx_nodes(data_weighted, pos, node_color='pink', node_size=150)
nx.draw_networkx_edges(data_weighted, pos, edgelist=data_weighted.edges())
nx.draw_networkx_labels(data_weighted, pos, font_size=14, font_family="sans-serif", )
# Annotate each edge with its accumulated point total.
labels = nx.get_edge_attributes(data_weighted,'Weight')
nx.draw_networkx_edge_labels(data_weighted,pos,edge_labels=labels)
ax = plt.gca()
ax.margins(0.05)
plt.axis("off")
plt.tight_layout()
plt.show()
# Same graph, drawn with Kamada-Kawai layout for comparison.
data_weighted = nx.from_pandas_edgelist(train_weighted, source='Source', target='Target', edge_attr=True, create_using = nx.DiGraph())
plt.rcParams['figure.figsize'] = [30, 30]
# Kamada-Kawai treats 'Weight' as a distance-like cost between nodes.
pos = nx.kamada_kawai_layout(data_weighted, scale = 0.5,weight='Weight')
nx.draw_networkx_nodes(data_weighted, pos, node_color='pink', node_size=150)
nx.draw_networkx_edges(data_weighted, pos, edgelist=data_weighted.edges())
nx.draw_networkx_labels(data_weighted, pos, font_size=12, font_family="sans-serif", )
labels = nx.get_edge_attributes(data_weighted,'Weight')
nx.draw_networkx_edge_labels(data_weighted,pos,edge_labels=labels)
ax = plt.gca()
ax.margins(0.05)
plt.axis("off")
plt.tight_layout()
plt.show()
# Interactive pyvis rendering of the same graph, embedded in the notebook.
tmp1 = Network(height='600px', width='70%',notebook=True,heading='Viz')
tmp1.from_nx(data_weighted)
# Expose the physics control panel in the generated HTML.
tmp1.show_buttons(filter_=['physics'])
tmp1.show('viz.html')
display(HTML('viz.html'))
%%time
# Unweighted, undirected graph over ALL train games; per-player centralities
# become model features later.
edge_list_centr = nx.from_pandas_edgelist(train, source='White Player #', target='Black Player #', create_using = nx.Graph())
dgc = nx.degree_centrality(edge_list_centr)
eig = nx.eigenvector_centrality(edge_list_centr)
pgr = nx.pagerank(edge_list_centr)
cls = nx.closeness_centrality(edge_list_centr)
# Betweenness dominates the runtime here (hence the %%time magic).
btw = nx.betweenness_centrality(edge_list_centr)
# One row per node; map each centrality dict onto the node index.
cr = pd.DataFrame(index=edge_list_centr.nodes())
cr['dgc'] = cr.index.map(dgc)
cr['eig'] = cr.index.map(eig)
cr['pgr'] = cr.index.map(pgr)
cr['cls'] = cr.index.map(cls)
cr['btw'] = cr.index.map(btw)
# Checkpoint; the node index round-trips as 'Unnamed: 0' and is renamed back.
cr.to_csv('/content/drive/MyDrive/DA 516 - Network Project/train_centrality.csv')
cr = pd.read_csv('/content/drive/MyDrive/DA 516 - Network Project/train_centrality.csv')
cr = cr.rename(columns = {'Unnamed: 0': 'Player'})
cr.head()
# Number of games played per (white, black) pair.
df=pd.DataFrame(train.groupby(['White Player #','Black Player #'])['Score'].count()).reset_index()
df.Score.sum()
G = nx.from_pandas_edgelist(df, source='White Player #', target='Black Player #', edge_attr=True, create_using = nx.Graph())
# NOTE(review): G is already an undirected Graph, so to_undirected() is just a
# copy and the zero-then-accumulate loops below merely re-copy Score; this
# merge pattern is meant for a DiGraph source — confirm intent.
G_und = G.to_undirected(reciprocal=False)
for u, v, d in G_und.edges(data=True):
    G_und[u][v]['Score'] = 0
for u, v, d in G.edges(data=True):
    G_und[u][v]['Score'] += G[u][v]['Score']
For analysing the network, we create an undirected network
# Re-run of the symmetrization cell above (duplicate notebook cell).
G_und = G.to_undirected(reciprocal=False)
for u, v, d in G_und.edges(data=True):
    G_und[u][v]['Score'] = 0
for u, v, d in G.edges(data=True):
    G_und[u][v]['Score'] += G[u][v]['Score']
# NOTE(review): nx.info() was removed in networkx 3.0 — pin networkx < 3 or print the graph directly.
print(nx.info(G_und))
import collections
import matplotlib.pyplot as plt
# Degree histogram: one bar per distinct degree value, labelled on the x-axis.
degree_sequence = sorted([d for n, d in G_und.degree()], reverse=True)  # degree sequence
degreeCount = collections.Counter(degree_sequence)
deg, cnt = zip(*degreeCount.items())
fig, ax = plt.subplots()
plt.bar(deg, cnt, width=0.80, color="b")
plt.title("Degree Histogram")
plt.ylabel("Count")
plt.xlabel("Degree")
ax.set_xticks([d + 0.4 for d in deg])
ax.set_xticklabels(deg)
plt.show()
# Node strength = weighted degree (total games played by each player here).
strengths = [G_und.degree(n, weight='Score') for n in G_und.nodes()]
plt.hist(strengths, bins = 8)
plt.show()
# Mean strength across nodes.
sum(strengths)/len(strengths)
# Maximal cliques of the full game graph.
cliques = list(nx.find_cliques(G_und))
cliques
# Count cliques strictly larger than each size i (printed as size i+1 and up).
for i in range(2,15):
    cliques_gt3 = [x for x in cliques if len(x)>i]
    print(i+1,len(cliques_gt3))
# Cliques with more than 10 members.
cliques_gt3 = [x for x in cliques if len(x)>10]
cliques_gt3
nx.density(G_und)
# Keep only pairs that played each other at least 5 times.
G_und_filtered = G_und.copy()
for u, v, d in G_und.edges(data=True):
    if G_und[u][v]['Score'] < 5:
        G_und_filtered.remove_edge(u, v)
Players who played with each other at least 5 times
# Compare the full vs. edge-filtered graph sizes.
print(nx.info(G_und))
print(nx.info(G_und_filtered))
# Maximal cliques after dropping weak (< 5 games) edges.
new_cliques = list(nx.find_cliques(G_und_filtered))
new_cliques
# Clique-size counts, as above.
for i in range(2,15):
    cliques_gt3 = [x for x in new_cliques if len(x)>i]
    print(i+1,len(cliques_gt3))
There are 41 cliques of size greater than 3 in which every pair of players met at least 5 times
new_cliques_gt3 = [x for x in new_cliques if len(x)>3]
new_cliques_gt3
# NOTE(review): G was built as an undirected nx.Graph (L154), and
# is_weakly_connected raises NetworkXNotImplemented on undirected graphs —
# this line likely errors; nx.is_connected(G) was probably intended.
nx.is_weakly_connected(G)
nx.is_connected(G_und)
# Connected components; keep the largest one (LCC) for path-based metrics.
all_components = list(nx.connected_components(G_und))
all_components
all_comp_sorted = sorted(nx.connected_components(G_und), key=len, reverse=True)
G_und_lcc = G_und.subgraph(all_comp_sorted[0])
print(nx.info(G_und_lcc))
print(nx.info(G_und))
print(nx.info(G_und_lcc))
# Diameter / average shortest path are only defined on a connected graph, hence the LCC.
nx.diameter(G_und_lcc)
print(nx.average_shortest_path_length(G_und_lcc))
nx.density(G_und_lcc)
print("avarage clustering: ", nx.average_clustering(G_und_lcc))
print("avarage clustering: ", nx.average_clustering(G_und))
print(nx.transitivity(G_und_lcc))
print(nx.transitivity(G_und))
# Two-way community split (Kernighan-Lin), weighted by games played, and its modularity.
from networkx.algorithms import community as nxcomm
kl_res = nxcomm.kernighan_lin_bisection(G_und, weight = 'Score')
kl_res
nxcomm.quality.modularity(G_und, communities = kl_res)
kl_res = nxcomm.kernighan_lin_bisection(G_und_lcc, weight = 'Score')
nxcomm.quality.modularity(G_und_lcc, communities = kl_res)
# Repeat the network analysis for "experienced" players (> 100 games).
more_than_hundred_game = player_stats[player_stats.total_game_count > 100]['Player'].unique()
sub_ = train[train['White Player #'].isin(more_than_hundred_game)]
sub_ = sub_[sub_['Black Player #'].isin(more_than_hundred_game)]
sub_.head()
more_than_hundred_game = player_stats[player_stats.total_game_count > 100]
more_than_hundred_game.describe()
sub_.describe()
# Games-played counts per pair among experienced players.
match_count_between_experienced=pd.DataFrame(sub_.groupby(['White Player #','Black Player #'])['Score'].count()).reset_index()
G = nx.from_pandas_edgelist(match_count_between_experienced, source='White Player #', target='Black Player #', edge_attr=True, create_using = nx.Graph())
match_count_between_experienced.Score.sum()
# Symmetrize Score (no-op for an already-undirected G; see note at the first occurrence).
G_und = G.to_undirected(reciprocal=False)
for u, v, d in G_und.edges(data=True):
    G_und[u][v]['Score'] = 0
for u, v, d in G.edges(data=True):
    G_und[u][v]['Score'] += G[u][v]['Score']
print(nx.info(G_und))
import collections
import matplotlib.pyplot as plt
# Degree histogram, same recipe as for the full graph.
degree_sequence = sorted([d for n, d in G_und.degree()], reverse=True)  # degree sequence
degreeCount = collections.Counter(degree_sequence)
deg, cnt = zip(*degreeCount.items())
fig, ax = plt.subplots()
plt.bar(deg, cnt, width=0.80, color="b")
plt.title("Degree Histogram")
plt.ylabel("Count")
plt.xlabel("Degree")
ax.set_xticks([d + 0.4 for d in deg])
ax.set_xticklabels(deg)
plt.show()
import matplotlib.pyplot as plt
import networkx as nx
# Binned degree and strength distributions.
degrees = [G_und.degree(n) for n in G_und.nodes()]
plt.hist(degrees, bins=8)
plt.show()
strengths = [G_und.degree(n, weight='Score') for n in G_und.nodes()]
plt.hist(strengths, bins = 8)
plt.show()
nx.density(G_und)
# Maximal cliques among experienced players, with size counts.
cliques = list(nx.find_cliques(G_und))
cliques
for i in range(2,15):
    cliques_gt3 = [x for x in cliques if len(x)>i]
    print(i+1,len(cliques_gt3))
cliques_gt10= [x for x in cliques if len(x)>10]
cliques_gt10
# Drop pairs that met fewer than 5 times, then re-run the clique analysis.
G_und_filtered = G_und.copy()
for u, v, d in G_und.edges(data=True):
    if G_und[u][v]['Score'] < 5:
        G_und_filtered.remove_edge(u, v)
print(nx.info(G_und))
print(nx.info(G_und_filtered))
new_cliques = list(nx.find_cliques(G_und_filtered))
new_cliques
for i in range(2,15):
    cliques_gt3 = [x for x in new_cliques if len(x)>i]
    print(i+1,len(cliques_gt3))
new_cliques_gt3 = [x for x in new_cliques if len(x)>3]
new_cliques_gt3
# NOTE(review): G here is an undirected Graph (L237); is_weakly_connected
# raises NetworkXNotImplemented on undirected graphs — likely errors.
nx.is_weakly_connected(G)
nx.diameter(G_und)
print("avarage clustering: ", nx.average_clustering(G_und))
print(nx.transitivity(G_und))
print(nx.average_shortest_path_length(G_und))
# Kernighan-Lin bisection weighted by games played, plus modularity of the split.
from networkx.algorithms import community as nxcomm
kl_res = nxcomm.kernighan_lin_bisection(G_und, weight = 'Score')
nxcomm.quality.modularity(G_und, communities = kl_res)
pos = nx.spring_layout(G_und) #calculate position for each node
# pos is needed because we are going to draw a few nodes at a time,
# pos fixes their positions.
# Notice that the pos dict is passed to each call to draw below
# Draw the graph, but don't color the nodes
nx.draw(G_und, pos, edge_color='k', with_labels=True,
        font_weight='light', node_size= 280, width= 0.9)
#For each community list, draw the nodes, giving it a specific color.
nx.draw_networkx_nodes(G_und, pos, nodelist=kl_res[0], node_color='b')
nx.draw_networkx_nodes(G_und, pos, nodelist=kl_res[1], node_color='r')
player_stats.describe()
# "Best" players: more than 50 games AND average score above 0.590833
# (the value appears to be the 75th-percentile Avg_Score from describe() — confirm).
more_than_fifthy_game = player_stats[player_stats.total_game_count > 50]
more_than_fifthy_game.describe()
more_than_fifthy_game_best = more_than_fifthy_game[more_than_fifthy_game.Avg_Score > 0.590833]
more_than_fifthy_game_best.describe()
best_players = more_than_fifthy_game_best['Player'].unique()
# Games where both sides are "best" players.
sub_ = train[train['White Player #'].isin(best_players)]
sub_ = sub_[sub_['Black Player #'].isin(best_players)]
sub_.head()
match_count_between_best=pd.DataFrame(sub_.groupby(['White Player #','Black Player #'])['Score'].count()).reset_index()
# Directed graph this time, so reciprocity is well-defined.
G = nx.from_pandas_edgelist(match_count_between_best, source='White Player #', target='Black Player #', edge_attr=True, create_using = nx.DiGraph())
G_und = G.to_undirected(reciprocal=False)
# Fraction of directed edges that exist in both directions.
nx.reciprocity(G)
# Symmetrize: sum the two directed Scores onto each undirected edge.
G_und = G.to_undirected(reciprocal=False)
for u, v, d in G_und.edges(data=True):
    G_und[u][v]['Score'] = 0
for u, v, d in G.edges(data=True):
    G_und[u][v]['Score'] += G[u][v]['Score']
print(nx.info(G_und))
nx.density(G_und)
# Cliques among the best players.
cliques = list(nx.find_cliques(G_und))
cliques
cliques_gt5= [x for x in cliques if len(x)>5]
cliques_gt5
# Keep only pairs that met at least 4 times, then recompute cliques.
G_und_filtered = G_und.copy()
for u, v, d in G_und.edges(data=True):
    if G_und[u][v]['Score'] < 4:
        G_und_filtered.remove_edge(u, v)
print(nx.info(G_und))
print(nx.info(G_und_filtered))
new_cliques = list(nx.find_cliques(G_und_filtered))
new_cliques
new_cliques_gt3 = [x for x in new_cliques if len(x)>2]
new_cliques_gt3
import collections
import matplotlib.pyplot as plt
# Degree histogram for the best-player network (same recipe as earlier cells).
degree_sequence = sorted([d for n, d in G_und.degree()], reverse=True)  # degree sequence
degreeCount = collections.Counter(degree_sequence)
deg, cnt = zip(*degreeCount.items())
fig, ax = plt.subplots()
plt.bar(deg, cnt, width=0.80, color="b")
plt.title("Degree Histogram")
plt.ylabel("Count")
plt.xlabel("Degree")
ax.set_xticks([d + 0.4 for d in deg])
ax.set_xticklabels(deg)
plt.show()
import matplotlib.pyplot as plt
import networkx as nx
# Binned degree and strength (weighted-degree) distributions.
degrees = [G_und.degree(n) for n in G_und.nodes()]
plt.hist(degrees, bins=8)
plt.show()
strengths = [G_und.degree(n, weight='Score') for n in G_und.nodes()]
plt.hist(strengths, bins = 8)
plt.show()
# Global metrics for the best-player network (assumes G_und is connected;
# nx.diameter raises otherwise — confirm).
nx.diameter(G_und)
print("avarage clustering: ", nx.average_clustering(G_und))
print(nx.transitivity(G_und))
print(nx.average_shortest_path_length(G_und))
# Two-community split plus modularity, then a coloured plot of the split.
from networkx.algorithms import community as nxcomm
kl_res = nxcomm.kernighan_lin_bisection(G_und, weight = 'Score')
nxcomm.quality.modularity(G_und, communities = kl_res)
pos = nx.spring_layout(G_und) #calculate position for each node
# pos is needed because we are going to draw a few nodes at a time,
# pos fixes their positions.
# Notice that the pos dict is passed to each call to draw below
# Draw the graph, but don't color the nodes
nx.draw(G_und, pos, edge_color='k', with_labels=True,
        font_weight='light', node_size= 280, width= 0.9)
#For each community list, draw the nodes, giving it a specific color.
nx.draw_networkx_nodes(G_und, pos, nodelist=kl_res[0], node_color='b')
nx.draw_networkx_nodes(G_und, pos, nodelist=kl_res[1], node_color='r')
# Drop the index column introduced by the earlier to_csv/read_csv round trip.
# NOTE(review): assumes `train` was re-read with an 'Unnamed: 0' column — confirm.
train.drop(columns ={'Unnamed: 0'},inplace = True)
## Rebuild per-player stats (same recipe as the earlier cell).
# Explicit copies avoid SettingWithCopy on the slice assignments below.
train_source = train[['White Player #','Score']].copy()
train_source['Score'] = np.where(train_source.Score == 1, 'white_win', np.where(train_source.Score == 0, 'white_lost', 'white_draw'))
train_source.rename(columns = {'White Player #':'Player'},inplace = True)
train_target = train[['Black Player #','Score']].copy()
train_target['Score'] = np.where(train_target.Score == 0, 'black_win', np.where(train_target.Score == 1, 'black_lost', 'black_draw'))
train_target.rename(columns = {'Black Player #':'Player'},inplace = True)
# BUG FIX: pd.concat's axis argument is keyword-only in pandas >= 2.0;
# `pd.concat([...], 0)` raises TypeError there.
train_df = pd.concat([train_source, train_target], axis=0)
train_df.head()
# Outcome counts per player, pivoted to one row per player.
player_stats = pd.DataFrame(train_df.groupby(['Player','Score'])['Score'].describe()).reset_index()
player_stats = pd.DataFrame(player_stats.pivot(index = 'Player',columns = 'Score', values = 'count').fillna(0)).reset_index()
# Draws count half a point.
player_stats['total_score'] = (player_stats.black_draw+player_stats.white_draw)*0.5 +(player_stats.black_win+player_stats.white_win)
player_stats['total_game_count'] = player_stats[['black_draw','black_lost','black_win','white_draw','white_lost','white_win']].sum(axis=1)
player_stats['Avg_Score'] = player_stats.total_score / player_stats.total_game_count
player_stats.head()
# Weighted point-flow edge list over ALL train games (not just the 150+ subset).
# White earns points in games it did not lose; black in games it did not lose.
train_white = train[train.Score !=0]
train_black = train[train.Score !=1]
# Convert black wins (Score 0) into a full point for black.
train_black['Score'] = np.where(train_black.Score == 0,1,train_black.Score)
train_white = pd.DataFrame(train_white.groupby(['White Player #','Black Player #'])['Score'].sum()).reset_index()
train_black = pd.DataFrame(train_black.groupby(['White Player #','Black Player #'])['Score'].sum()).reset_index()
# Edges point from the point-earning player: white keeps orientation, black's edges flip.
train_white.rename(columns = {'White Player #':'Source','Black Player #':'Target'},inplace = True)
train_black.rename(columns = {'White Player #':'Target','Black Player #':'Source'},inplace = True)
train_weighted = pd.concat([train_white,train_black])
train_weighted.rename(columns = {'Score': 'Weight'},inplace = True)
train_weighted = train_weighted.groupby(['Source','Target'])['Weight'].sum().reset_index()
train_weighted.head()
# BUG FIX: nx.reciprocity is only implemented for directed graphs; calling it
# on an undirected nx.Graph raises NetworkXNotImplemented. Build a DiGraph
# here, matching the directed construction used for the experienced/best
# player networks below — Source/Target orientation is meaningful in
# train_weighted, so no information is lost.
data_weighted = nx.from_pandas_edgelist(train_weighted, source='Source', target='Target', edge_attr=True, create_using = nx.DiGraph())
nx.reciprocity(data_weighted)
Between players there are not many sweeps: 64% got at least a draw from their opponents in head-to-head games, so it is extremely hard to predict the correct result!
# Centralities on the weighted point-flow network, mapped onto player_stats
# as extra features. Players absent from the weighted graph map to NaN.
dgc = nx.degree_centrality(data_weighted)
eig = nx.eigenvector_centrality(data_weighted)
pgr = nx.pagerank(data_weighted)
cls = nx.closeness_centrality(data_weighted)
btw = nx.betweenness_centrality(data_weighted)
player_stats['dgc'] = player_stats.Player.map(dgc)
player_stats['eig'] = player_stats.Player.map(eig)
player_stats['pgr'] = player_stats.Player.map(pgr)
player_stats['cls'] = player_stats.Player.map(cls)
player_stats['btw'] = player_stats.Player.map(btw)
# Correlation table
sns.set(font_scale = 1.25)
correlation_matrix = player_stats.corr()
plt.figure(figsize=(13,13))
ax = sns.heatmap(correlation_matrix, vmax=1, cbar=True, square=True, annot=True, fmt='.2f',
                 annot_kws={'size': 12}, cmap='coolwarm')
ax.xaxis.set_ticks_position('top')
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()
player_stats.describe()
WEIGHTED NETWORK BETWEEN EXPERIENCED PLAYERS
# Weighted directed network restricted to experienced players (> 100 games).
more_than_hundred_game = player_stats[player_stats.total_game_count > 100]['Player'].unique()
sub_ = train_weighted[train_weighted['Source'].isin(more_than_hundred_game)]
sub_ = sub_[sub_['Target'].isin(more_than_hundred_game)]
sub_.head()
sub_.Weight.sum()
data_experienced= nx.from_pandas_edgelist(sub_, source='Source', target='Target', edge_attr=True, create_using = nx.DiGraph())
# Share of directed edges present in both directions (both players scored points off each other).
nx.reciprocity(data_experienced)
Between experienced players there are not many sweeps: 80% got at least a draw from their opponents in head-to-head games
# Same reciprocity check for the "best" players (> 50 games, Avg_Score above threshold).
more_than_fifthy_game = player_stats[player_stats.total_game_count > 50]
more_than_fifthy_game_best = more_than_fifthy_game[more_than_fifthy_game.Avg_Score > 0.590833]['Player'].unique()
sub_ = train_weighted[train_weighted['Source'].isin(more_than_fifthy_game_best)]
sub_ = sub_[sub_['Target'].isin(more_than_fifthy_game_best)]
data_best= nx.from_pandas_edgelist(sub_, source='Source', target='Target', edge_attr=True, create_using = nx.DiGraph())
nx.reciprocity(data_best)
Between the best players there are also not many sweeps: 80% got at least a draw from their opponents in head-to-head games
# node2vec embeddings over the unweighted game graph.
edge_list = nx.from_pandas_edgelist(train, source='White Player #', target='Black Player #',create_using = nx.Graph())
# p = q = 1 makes the walks equivalent to unbiased DeepWalk-style random walks.
n2v = Node2Vec(edge_list, dimensions=20, walk_length=50, num_walks=500, p = 1, q = 1)
model = n2v.fit(window=10)
# Persist embeddings; re-read as a DataFrame indexed by player id.
model.wv.save_word2vec_format('/content/drive/MyDrive/DA 516 - Network Project/train_embs.txt')
embs = pd.read_csv('/content/drive/MyDrive/DA 516 - Network Project/train_embs.txt', sep=' ', skiprows=1, header=None, index_col = 0)
embs.head()
# Bucket players into low/medium/high total-score tiers for plot colouring.
player_stats['total_score_bin'] = pd.qcut(player_stats['total_score'], 3, labels=["low", "medium", "high"])
player_stats.groupby('total_score_bin')['total_score'].describe()
# 2-D t-SNE projection of the embeddings (Pipeline works here because
# fit_transform is called — TSNE has no separate transform step).
pipe_tsne = Pipeline([
    ("scaler", StandardScaler()),
    ("tsne", TSNE(n_components=2))
])
embeded_tsne = pipe_tsne.fit_transform(embs)
plt.rcParams['figure.figsize'] = [10, 10]
# NOTE(review): hue comes from player_stats row order while the points come
# from embs row order — assumes the two frames are aligned; verify.
sns.scatterplot(x=embeded_tsne[:, 0], y=embeded_tsne[:,1], hue = player_stats['total_score_bin']);
# Embedding cosine similarity for every unordered pair of embedded players.
# O(n^2) pairs — acceptable at this sample size.
candidate_nodes = list(embs.index)
all_pairs = combinations(candidate_nodes, 2)
similarity = []
pair0 = []
pair1 = []
for left, right in all_pairs:
    pair0.append(left)
    pair1.append(right)
    # gensim keys are strings, hence the str() casts.
    similarity.append(model.wv.similarity(str(left), str(right)))
similarity_df = pd.DataFrame({'similarity': similarity, 'player_1': pair0, 'player_2': pair1})
# Checkpoint the pair table, then reload it for the feature-merge cells.
similarity_df.to_csv('/content/drive/MyDrive/DA 516 - Network Project/train_similarity.csv',index = False)
similarity = pd.read_csv('/content/drive/MyDrive/DA 516 - Network Project/train_similarity.csv')
similarity.head()
## EDA: does pair similarity separate the three outcomes?
# Attach similarity in both pair orientations, then stack the matches.
# dropna discards rows whose orientation did not match (left-join NaNs).
train_df_1 = train.merge(similarity,how = 'left', left_on = ['White Player #','Black Player #'], right_on =['player_1','player_2'])
train_df_1.dropna(inplace = True)
train_df_2 = train.merge(similarity,how = 'left', left_on = ['White Player #','Black Player #'], right_on =['player_2','player_1'])
train_df_2.dropna(inplace = True)
# BUG FIX: pd.concat's axis argument is keyword-only in pandas >= 2.0;
# `pd.concat([...], 0)` raises TypeError there.
train_df = pd.concat([train_df_1, train_df_2], axis=0)
train_df.drop(columns = ['player_1','player_2'],inplace = True)
del train_df_1,train_df_2
# Human-readable outcome labels for the plot.
train_df['Score_names'] = np.where(train_df.Score == 1, 'white_wins', np.where(train_df.Score==0,'black_wins','draw'))
train_df.head()
sns.boxplot(x='similarity',y='Score_names',data = train_df,showmeans=True);
del train_df
## player statistics calculated on train data
# Join player_stats onto each game twice — once for the white player, once for
# the black player — prefixing the columns to keep them apart.
train_df = train.merge(player_stats,how = 'left', left_on = 'White Player #',right_on = 'Player').drop(columns = 'Player')
train_df = train_df.rename(columns = {'black_draw':'White_Player_black_draw','black_lost':'White_Player_black_lost',
                                      'black_win':'White_Player_black_win', 'white_draw':'White_Player_white_draw',
                                      'white_lost': 'White_Player_white_lost','white_win': 'White_Player_white_win', 'total_score': 'White_Player_total_score',
                                      'total_game_count': 'White_Player_total_game_count', 'Avg_Score': 'White_Player_Avg_Score'})
train_df = train_df.merge(player_stats,how = 'left', left_on = 'Black Player #',right_on = 'Player').drop(columns = 'Player')
train_df = train_df.rename(columns = {'black_draw':'Black_Player_black_draw','black_lost':'Black_Player_black_lost',
                                      'black_win':'Black_Player_black_win', 'white_draw':'Black_Player_white_draw',
                                      'white_lost': 'Black_Player_white_lost','white_win': 'Black_Player_white_win', 'total_score': 'Black_Player_total_score',
                                      'total_game_count': 'Black_Player_total_game_count', 'Avg_Score': 'Black_Player_Avg_Score'})
# Same joins for the test split — note the stats are still computed on TRAIN only.
test_df = test.merge(player_stats,how = 'left', left_on = 'White Player #',right_on = 'Player').drop(columns = 'Player')
test_df = test_df.rename(columns = {'black_draw':'White_Player_black_draw','black_lost':'White_Player_black_lost',
                                    'black_win':'White_Player_black_win', 'white_draw':'White_Player_white_draw',
                                    'white_lost': 'White_Player_white_lost','white_win': 'White_Player_white_win', 'total_score': 'White_Player_total_score',
                                    'total_game_count': 'White_Player_total_game_count', 'Avg_Score': 'White_Player_Avg_Score'})
test_df = test_df.merge(player_stats,how = 'left', left_on = 'Black Player #',right_on = 'Player').drop(columns = 'Player')
test_df = test_df.rename(columns = {'black_draw':'Black_Player_black_draw','black_lost':'Black_Player_black_lost',
                                    'black_win':'Black_Player_black_win', 'white_draw':'Black_Player_white_draw',
                                    'white_lost': 'Black_Player_white_lost','white_win': 'Black_Player_white_win', 'total_score': 'Black_Player_total_score',
                                    'total_game_count': 'Black_Player_total_game_count', 'Avg_Score': 'Black_Player_Avg_Score'})
print(test_df.shape)
print(train_df.shape)
train_df.head()
## player centralities calculated on train data
# Join the precomputed centrality table (cr) for each side of the game,
# prefixing columns with White_/Black_.
train_df = train_df.merge(cr, how ='left', left_on = 'White Player #',right_on = 'Player')
train_df = train_df.rename(columns = {'dgc':'White_dgc', 'eig':'White_eig', 'pgr':'White_pgr','cls':'White_cls','btw':'White_btw'})
train_df.drop(columns = 'Player',inplace = True)
train_df = train_df.merge(cr, how ='left', left_on = 'Black Player #',right_on = 'Player')
train_df = train_df.rename(columns = {'dgc':'Black_dgc', 'eig':'Black_eig', 'pgr':'Black_pgr','cls':'Black_cls','btw':'Black_btw'})
train_df.drop(columns = 'Player',inplace = True)
# Same centralities (train-derived) joined onto the test split.
test_df = test_df.merge(cr, how ='left', left_on = 'White Player #',right_on = 'Player')
test_df = test_df.rename(columns = {'dgc':'White_dgc', 'eig':'White_eig', 'pgr':'White_pgr','cls':'White_cls','btw':'White_btw'})
test_df.drop(columns = 'Player',inplace = True)
test_df = test_df.merge(cr, how ='left', left_on = 'Black Player #',right_on = 'Player')
test_df = test_df.rename(columns = {'dgc':'Black_dgc', 'eig':'Black_eig', 'pgr':'Black_pgr','cls':'Black_cls','btw':'Black_btw'})
test_df.drop(columns = 'Player',inplace = True)
print(test_df.shape)
print(train_df.shape)
train_df.head()
## player embedings calculated on train data
# Join the 20-dim node2vec embedding for each side; pandas auto-suffixes the
# duplicated numeric column names (_x for white, _y for black).
train_df = train_df.merge(embs, how ='left', left_on = 'White Player #',right_on = embs.index )
train_df = train_df.merge(embs, how ='left', left_on = 'Black Player #',right_on = embs.index )
test_df = test_df.merge(embs, how ='left', left_on = 'White Player #',right_on = embs.index )
test_df = test_df.merge(embs, how ='left', left_on = 'Black Player #',right_on = embs.index )
print(test_df.shape)
print(train_df.shape)
train_df.head()
## player similarities calculated on train data
# Attach the pair similarity in both orientations and stack the matched rows;
# dropna discards rows whose orientation produced no match (left-join NaNs).
train_df_1 = train_df.merge(similarity,how = 'left', left_on = ['White Player #','Black Player #'], right_on =['player_1','player_2'])
train_df_1.dropna(inplace = True)
train_df_2 = train_df.merge(similarity,how = 'left', left_on = ['White Player #','Black Player #'], right_on =['player_2','player_1'])
train_df_2.dropna(inplace = True)
# BUG FIX: pd.concat's axis argument is keyword-only in pandas >= 2.0;
# `pd.concat([...], 0)` raises TypeError there.
train_df = pd.concat([train_df_1, train_df_2], axis=0)
train_df.drop(columns = ['player_1','player_2'],inplace = True)
del train_df_1,train_df_2
# Same treatment for the test split.
test_df_1 = test_df.merge(similarity,how = 'left', left_on = ['White Player #','Black Player #'], right_on =['player_1','player_2'])
test_df_1.dropna(inplace = True)
test_df_2 = test_df.merge(similarity,how = 'left', left_on = ['White Player #','Black Player #'], right_on =['player_2','player_1'])
test_df_2.dropna(inplace = True)
test_df = pd.concat([test_df_1, test_df_2], axis=0)
test_df.drop(columns = ['player_1','player_2'],inplace = True)
del test_df_1,test_df_2
print(test_df.shape)
print(train_df.shape)
train_df.head()
# Checkpoint the fully-featured modelling frames.
train_df.to_csv('/content/drive/MyDrive/DA 516 - Network Project/train_df.csv',index = False)
test_df.to_csv('/content/drive/MyDrive/DA 516 - Network Project/test_df.csv',index = False)
train_df = pd.read_csv('/content/drive/MyDrive/DA 516 - Network Project/train_df.csv')
test_df = pd.read_csv('/content/drive/MyDrive/DA 516 - Network Project/test_df.csv')
print(train_df.shape)
print(test_df.shape)
train_df.head()
# Every column except the ids and the raw Score is a model feature.
features = train_df.drop(columns = ['White Player #', 'Black Player #', 'Score']).columns.to_list()
# Target encoding: 1 = white wins, 2 = black wins, 3 = draw (Score 0.5).
train_df['target'] = np.where(train_df.Score == 1,1 , np.where(train_df.Score == 0, 2,3 ))
# Logistic regression baseline: scale features, then randomized search over
# penalty/solver/C combinations (split so solver matches penalty support).
pipeline = Pipeline(steps=[('preprocessor', StandardScaler()),
                           ('classifier', LogisticRegression(random_state=42))])
params = [{'classifier__C' : np.arange(0.1, 5.0, 0.5),
           'classifier__penalty': ['l1'],
           'classifier__solver' : ['liblinear','saga']},
          {'classifier__C' : np.arange(0.1, 5.0, 0.5),
           'classifier__penalty': ['l2'],
           'classifier__solver' : ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']}
          ]
# Multi-metric scoring; the model is refit on weighted F1.
met = ['accuracy', 'f1_weighted', 'precision_weighted', 'recall_weighted']
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=3,random_state=42)
log_grid = RandomizedSearchCV(pipeline,param_distributions = params,cv = cv,verbose = 1, n_jobs = -1, scoring = met, refit = 'f1_weighted',n_iter =30)
log_grid.fit(train_df[features], train_df.target)
print(log_grid.best_params_)
print(log_grid.best_score_)
print('Logistic Regression Cross Validation Results')
print('')
# NOTE: these are means over ALL sampled candidates, not the best model's CV score.
print('Mean Accuracy Score :',log_grid.cv_results_['mean_test_accuracy'].mean())
print('='*20)
print('Mean f1-weighted Score :' ,log_grid.cv_results_['mean_test_f1_weighted'].mean())
print('='*20)
print('Mean Precision Score Precision :',log_grid.cv_results_['mean_test_precision_weighted'].mean())
print('='*20)
print('Mean Recall Score :',log_grid.cv_results_['mean_test_recall_weighted'].mean())
# Hold-out evaluation on the test split.
test_df['target'] = np.where(test_df.Score == 1,1 , np.where(test_df.Score == 0, 2,3 ))
y_pred_log = log_grid.predict(test_df[features])
target_names = ['White_Wins', 'Black_Wins', 'Draw']
print(classification_report(test_df['target'], y_pred_log, target_names=target_names))
confusion_matrix(test_df['target'], y_pred_log)
param_grid = {
    'clf__n_estimators' : [100,300],
    'clf__criterion' :['gini','entropy'],
    'clf__max_depth' : [3,5,9],
    # BUG FIX: max_features='auto' was deprecated in scikit-learn 1.1 and
    # removed in 1.3; for classifiers it was an alias for 'sqrt', so this
    # preserves the original behaviour on every sklearn version.
    'clf__max_features' :['sqrt'],
    'clf__min_samples_leaf' :[30,50,100],
    'clf__min_samples_split' : [30,50,100],
    'clf__class_weight' :['balanced',None],
    'clf__bootstrap': [False]
}
# Random forest with the same scaling/CV/search protocol as the logistic model.
pipeline = Pipeline(steps=[('preprocessor', StandardScaler()),('clf',RandomForestClassifier(random_state=42))])
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=3,random_state=42)
grid_rdf = RandomizedSearchCV(pipeline,param_distributions = param_grid,cv = cv,verbose = 1, n_jobs = -1, scoring = met, refit = 'f1_weighted',n_iter =30)
grid_rdf.fit(train_df[features], train_df.target)
print(grid_rdf.best_params_)
print(grid_rdf.best_score_)
print('Random Forest Cross Validation Results')
print('')
# NOTE: means over all sampled candidates, not the best model's CV score.
print('Mean Accuracy Score :',grid_rdf.cv_results_['mean_test_accuracy'].mean())
print('='*20)
print('Mean f1-weighted Score :' ,grid_rdf.cv_results_['mean_test_f1_weighted'].mean())
print('='*20)
print('Mean Precision Score Precision :',grid_rdf.cv_results_['mean_test_precision_weighted'].mean())
print('='*20)
print('Mean Recall Score :',grid_rdf.cv_results_['mean_test_recall_weighted'].mean())
# Hold-out evaluation on the test split.
test_df['target'] = np.where(test_df.Score == 1,1 , np.where(test_df.Score == 0, 2,3 ))
y_pred_rdf = grid_rdf.predict(test_df[features])
target_names = ['White_Wins', 'Black_Wins', 'Draw']
print(classification_report(test_df['target'], y_pred_rdf, target_names=target_names))
confusion_matrix(test_df['target'], y_pred_rdf)
# Randomized hyper-parameter search for an XGBoost pipeline.
param_grid = {
    'clf__learning_rate': [0.1],
    'clf__n_estimators': [300],
    'clf__max_depth': [2, 3],
    'clf__colsample_bytree': [0.6, 0.7, 0.8],
    'clf__subsample': [0.6, 0.7, 0.8],
    'clf__reg_alpha': [0.1, 0.3],
    'clf__reg_lambda': [0.5, 1],
    'clf__min_child_weight': [90, 110, 150],
    # NOTE(review): 'gpu_hist' assumes a GPU runtime (e.g. Colab); XGBoost >= 2.0
    # prefers tree_method='hist' with device='cuda' — confirm the installed version.
    'clf__tree_method': ['gpu_hist'],
    'clf__objective': ['multi:softmax'],
    'clf__eval_metric': ['merror'],
    # 'use_label_encoder': [False] was dropped: the flag is deprecated/removed in
    # recent XGBoost, and disabling the encoder on XGBoost 1.3-1.5 makes fit()
    # reject the 1/2/3 class labels used here (it expects 0..2).
    # NOTE(review): consider remapping `target` to 0/1/2 for modern XGBoost.
}
# early_stopping_rounds was removed from the estimator: XGBoost requires an
# explicit eval_set for early stopping, and RandomizedSearchCV supplies none,
# so every CV fit would raise.
pipe = Pipeline([('scaler', StandardScaler()),
                 ('clf', XGBClassifier(random_state=42))])
met = ['accuracy', 'f1_weighted', 'precision_weighted', 'recall_weighted']
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=42)
xgb_grid = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid,
                              scoring=met, refit='f1_weighted', cv=cv,
                              verbose=1, n_jobs=-1, n_iter=100)
xgb_grid.fit(train_df[features], train_df.target)
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)
print('XGBoost Cross Validation Results')
print('')
# Scores of the selected candidate only (averaging over all candidates would
# reflect the random search space rather than the refit model).
best = xgb_grid.best_index_
print('Mean Accuracy Score :', xgb_grid.cv_results_['mean_test_accuracy'][best])
print('=' * 20)
print('Mean f1-weighted Score :', xgb_grid.cv_results_['mean_test_f1_weighted'][best])
print('=' * 20)
print('Mean Precision Score Precision :', xgb_grid.cv_results_['mean_test_precision_weighted'][best])
print('=' * 20)
print('Mean Recall Score :', xgb_grid.cv_results_['mean_test_recall_weighted'][best])
# 1 = white wins, 2 = black wins, 3 = draw (see logistic-regression section).
test_df['target'] = np.where(test_df.Score == 1, 1, np.where(test_df.Score == 0, 2, 3))
y_pred_xgb = xgb_grid.predict(test_df[features])
target_names = ['White_Wins', 'Black_Wins', 'Draw']  # order matches labels 1, 2, 3
print(classification_report(test_df['target'], y_pred_xgb, target_names=target_names))
confusion_matrix(test_df['target'], y_pred_xgb)
# Tidy two-column table of the refit XGBoost model's feature importances,
# highest first.
best_clf = xgb_grid.best_estimator_['clf']
features_importances_xgb_grid = (
    pd.DataFrame({'feature': features, 'importance': best_clf.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
features_importances_xgb_grid.head()
# Per-class comparison of selected features: 3x3 grid of bar plots (seaborn
# bars show the per-class mean with a confidence interval).
train_df['target_names'] = np.where(train_df.Score == 1, 'White_Wins',
                                    np.where(train_df.Score == 0, 'Black_Wins', 'Draw'))
# Presumably the embedding columns picked up '_x'/'_y' suffixes from an earlier
# merge; give them readable names (no-op if the rename was already applied).
train_df.rename(columns={'1_x': 'White_Embeding_1', '1_y': 'Black_Embeding_1'},
                inplace=True)
_bar_cols = ['White_eig', 'Black_Player_black_draw', 'Black_Player_total_score',
             'White_Player_total_score', 'White_cls', 'White_Player_Avg_Score',
             'Black_eig', 'White_Embeding_1', 'Black_Embeding_1']
fig, axes = plt.subplots(3, 3, figsize=(16, 9))
# Draw on the axes returned by subplots() instead of re-creating each one with
# plt.subplot(3, 3, k): on matplotlib >= 3.6 the latter stacks duplicate axes
# on top of the (otherwise unused) ones subplots() already made.
for col, ax in zip(_bar_cols, axes.flat):
    sns.barplot(x='target_names', y=col, data=train_df, ax=ax)
fig.tight_layout()
plt.show()
# Same 3x3 feature grid as the bar plots, but as box plots to show the full
# per-class distributions rather than just the means.
_box_cols = ['White_eig', 'Black_Player_black_draw', 'Black_Player_total_score',
             'White_Player_total_score', 'White_cls', 'White_Player_Avg_Score',
             'Black_eig', 'White_Embeding_1', 'Black_Embeding_1']
fig, axes = plt.subplots(3, 3, figsize=(16, 9))
# Draw on the axes returned by subplots() instead of re-creating each one with
# plt.subplot(3, 3, k): on matplotlib >= 3.6 the latter stacks duplicate axes
# on top of the (otherwise unused) ones subplots() already made.
for col, ax in zip(_box_cols, axes.flat):
    sns.boxplot(x='target_names', y=col, data=train_df, ax=ax)
fig.tight_layout()
plt.show()